In [ ]:
# Basic libraries import
import numpy as np
import pandas as pd
import seaborn as sns
import collections
import itertools
import random
import sys
import os
from os.path import join
from pathlib import Path
# Plotting
%matplotlib inline
sns.set_context("notebook", font_scale=1.5)
In [ ]:
data_folder = join(str(Path.home()), "Documents/datasets/")
In [ ]:
sentences = ["A brown fox jumped on the lazy dog",
"A brown fox jumped on the brown duck",
"A brown fox jumped on the lazy elephant",
"An elephant is eating green grass near the alpaca",
"A green alpaca tried to jump over an elephant",
"May you rest in a deep and dreamless slumber"]
In [ ]:
# naive whitespace tokenization
tokenized_sentences = [sent.strip().split() for sent in sentences]
# build the vocabulary: word <-> index mappings, most frequent words first
counter = collections.Counter(itertools.chain(*tokenized_sentences))
vocab = counter.most_common()
index_to_word = [x[0] for x in vocab]
word_to_index = {w: i for i, w in enumerate(index_to_word)}
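In [ ]:
# Quick sanity check (not in the original notebook): inspect the mappings just built.
print(index_to_word[:5])
print(word_to_index["brown"])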
In [ ]:
import gensim, logging
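In [ ]:
# Optional (not in the original notebook): gensim reports training progress through the
# standard logging module, so configuring it makes that output visible.
logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO)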
In [ ]:
# parameters
size = 200          # word vector dimensionality (size of the hidden layer; renamed vector_size in gensim >= 4.0)
min_count = 1       # ignore words that occur fewer times than this
workers = 4         # number of worker threads (parallel training only takes effect with Cython installed)
window = 10         # context window size
downsampling = 1e-3 # threshold for downsampling very frequent words
In [ ]:
print("Training model...")
model = gensim.models.Word2Vec([s.split() for s in sentences],
workers=workers,
size=size, min_count = min_count,
window = window, sample = downsampling)
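In [ ]:
# Quick check (not in the original notebook): query the freshly trained toy model
# with a word that is actually in its tiny vocabulary.
model.wv.most_similar("fox", topn=3)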
In [ ]:
# If you don't plan to train the model any further, calling
# init_sims will make the model much more memory-efficient
# (note: init_sims is deprecated in gensim >= 4.0).
model.init_sims(replace=True)
In [ ]:
# save model (corpus_name is just a placeholder label for this toy corpus)
corpus_name = "toy_corpus"
model_name = "w2v_{}_size{}_mincount{}_window{}".format(corpus_name, size, min_count, window)
model.save(model_name)
In [ ]:
# load model
model = gensim.models.Word2Vec.load(model_name)
In [ ]:
# Classic demo queries (note: these words are not in the toy vocabulary above,
# so they only work with a model trained on a large corpus).
model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
model.wv.doesnt_match("breakfast cereal dinner lunch".split())
model.wv.similarity('woman', 'man')
In [ ]:
# sentence to tensor: indexing with a list of words returns a (n_words, size) array
model.wv[['brown', 'fox']].shape
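In [ ]:
# Sketch (not in the original notebook): turn a whole tokenized sentence into a
# (sentence_length, size) matrix of word vectors.
model.wv[tokenized_sentences[0]].shape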
In [ ]:
import nltk
# nltk's FreqDist behaves like collections.Counter (itertools/collections are already imported above)
nltk.FreqDist(itertools.chain(*[s.split() for s in sentences]))
In [ ]:
collections.Counter(itertools.chain(*[s.split() for s in sentences]))
In [ ]:
# load pre-trained GloVe embeddings (100-dimensional vectors)
EMBEDDING_DIM = 100
embeddings = {}
with open(join(data_folder, "glove", "glove.6B.100d.txt"), encoding="utf-8") as glove:
    for line in glove:
        values = line.strip().split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embeddings[word] = vector
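In [ ]:
# Quick check (not in the original notebook): vocabulary size and vector dimensionality.
len(embeddings), len(embeddings['the'])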
In [ ]:
embeddings['objected']
In [ ]:
# find the position of 'objected' in the insertion order of the embeddings dict
for i, (k, v) in enumerate(embeddings.items()):
    if k == 'objected':
        print(i)
In [ ]:
# create embedding matrix: row i holds the GloVe vector of the word with index i
# (words missing from GloVe keep an all-zero row)
embeddings_matrix = np.zeros((len(word_to_index) + 1, EMBEDDING_DIM))
for word, i in word_to_index.items():
    if word in embeddings:
        embeddings_matrix[i] = embeddings[word]
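In [ ]:
# Sketch (not in the original notebook; Keras is not imported elsewhere in it):
# such a matrix is typically used as frozen weights of an Embedding layer.
from tensorflow.keras.layers import Embedding
from tensorflow.keras.initializers import Constant

embedding_layer = Embedding(input_dim=embeddings_matrix.shape[0],
                            output_dim=EMBEDDING_DIM,
                            embeddings_initializer=Constant(embeddings_matrix),
                            trainable=False)  # keep the pre-trained vectors fixed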
In [ ]: